#' ---
#' title: "Agenda Seeding: Carter Protest, County Demo, Electoral and Rainfall Data Combine"
#' date: "`r Sys.Date()`"
#' output: html_document
#' header-includes:
#'  - \usepackage{booktabs}
#'  - \usepackage{longtable}
#'  - \usepackage{array}
#'  - \usepackage{multirow}
#'  - \usepackage{wrapfig}
#'  - \usepackage{float}
#'  - \usepackage{colortbl}
#'  - \usepackage{pdflscape}
#'  - \usepackage{tabu}
#'  - \usepackage{threeparttable}
#'  - \usepackage{threeparttablex}
#'  - \usepackage[normalem]{ulem}
#'  - \usepackage{makecell}
#'  - \usepackage{dcolumn}
#'  - \usepackage{setspace}\doublespacing
#' ---



## ---- data_prep_spin_code, eval = FALSE, include = FALSE ----
# spin code to output Rmd / Rnw
# set output_format to "html_document" for html
# rmarkdown::render(input = here::here("code/carter_protest_data_prep3.R"), output_format = "pdf_document", clean = TRUE)


## ---- data_prep_setup1, include = FALSE ----
library(knitr)
library(here) # checked read., read_, load(, save(, source(


## ---- data_prep_load_packages, include = FALSE ----
# Loading necessary packages

#source(here("code/source_libraries.R"))

library(naniar)
library(estimatr)
library(simputation)

library(conflicted)
conflict_prefer("filter", "dplyr")
conflict_prefer("select", "dplyr")
conflict_prefer("here",   "here")

fiver <- function(x) {
    str_pad(
        string = x,
        width  = 5,
        side   = "left",
        pad    = "0"
    )
}

rse <- function(x) {
  suppressWarnings(robust.se(x)) %>% 
    broom::tidy() %>% 
    pull(std.error) %>% 
    unlist()
}

## ---- data_prep_load_custom_fuctions ----

se.func <- function(xx) {
  cov1 <- vcov(xx)
  se <- sqrt(diag(cov1))
  return(se)
}

kable_format <- case_when(
  knitr::is_latex_output() ~ "latex",
  knitr::is_html_output()  ~ "html",
  TRUE                     ~ "markdown"
)

star_format <- case_when(
  knitr::is_latex_output() ~ "latex",
  knitr::is_html_output()  ~ "html",
  TRUE                     ~ "text"
)




## ---- carter_data_load_scrub, eval = TRUE, include = FALSE ----
# Loading protest dataset to prepare for geocoding
# Only geocoded version used in later analyses
carter <- read_csv(here("data/Carter_data/carter_data.csv")) %>% 
  janitor::clean_names()


# Load state abbreviations from FIPS dataset and cleaning
st <- read.csv(here("data/icpsr_states_fips_census.csv"))

st <- st %>%
  clean_names() %>%
  rename(state = name, 
         st    = stateab) %>% 
  mutate(state = str_to_upper(state))


# Cleaning up some of the city coding in carter dataset
carter$city <- carter$city %>%
  str_replace_all("(FT\\.)", "FORT ") %>%
  str_replace_all("(ST\\.)", "SAINT ") %>%
  str_replace_all("ASHBURY PARK", "ASBURY PARK") %>%
  str_replace_all("SAN BERNADINO", "SAN BERNARDINO") %>%
  str_replace_all("SACREMENTO", "SACRAMENTO") %>%
  str_replace_all("QUEENS\\(JAMAICA \\& SAINT ALBANS\\)", "QUEENS") %>%
  str_replace_all("DISTRICT OF COLUMBIA", "WASHINGTON") %>%
  str_replace_all("  ", " ") %>%
  as.character()

# Do same for state data
carter$state <- carter$state %>%
  str_replace_all("WASHINGTON, D.C", "DISTRICT OF COLUMBIA")

# Original 'cityid' variable in doc had no '546' but 'POMONA' was coded
#   '545', same as the above entry 'PALO ALTO' so assuming 'POMONA' is '546'
carter$city[which(carter$cityid == 546)] <- "POMONA"

# Fixing two rows that have miscoded city or states (corrections are guesses)
bad.data <- which(carter$city == "SACRAMENTO" & carter$state == "INDIANA")
carter$state[bad.data]   <- "CALIFORNIA"
carter$stateid[bad.data] <- 5

bad.data <- which(carter$city == "NYACK" & carter$state == "ILLINOIS") # no such place in google maps
carter$state[bad.data] <- "NEW YORK" # assume NY
carter$stateid[bad.data] <- 32

# Adding Attica, for prison uprising
attica          <- carter[which(carter$state == "NEW YORK"), ][1, ]
attica$cityid   <- 3256 # invented number, first free in NY CITYID
attica$obs      <- 800 # invented,
attica$caseid   <- 543.5 # happens between CASEID 543 and 544
attica$day      <- 09
attica$month    <- 09
attica$year     <- 71
attica$newarr   <- 1000 # Estimated 1000 *already arrested* prisoners participate. Best measure of severity is 43 killed (same as Detroit, a max across all events) 
attica$newarson <- 0
attica$culmriot <- 1
attica$daysriot <- 4
attica$newinj   <- 36 # nyt reports scores injured
attica$killed   <- 43
attica$factsev  <- NA
attica$city     <- "ATTICA"
attica$state    <- "NEW YORK"


# view data
attica

# append
carter <- rbind(carter, attica)

# Creating carter dataset with state abbrevations
carter2       <- left_join(carter, st, by = "state")

dim(carter2) # 753

# convert some cols to character
carter2 <- carter2 %>% 
  mutate_at(vars(city, st, state), as.character)

# identify five violent protests in florida without cities
carter2 %>% 
  filter(is.na(city)) %>% 
  select(stateid, cityid, caseid, year, newarr)

# Some rows are missing 'city' and have NAs, removing
carter2 <- carter2 %>% filter(is.na(city) == FALSE)
dim(carter2) # 747

# Fixing upper and lowercase spelling to allows for correct matching
carter2$city   <- str_to_title(carter2$city)
carter2$cityst <- paste(carter2$city, ", ", carter2$st, sep = "")

## ---- load_carter_geocodes, include = FALSE ----

# geocodes created via batchgeo
carter_geo <- read.csv(here("data/Carter_data/carter_geocodes.csv"), header = TRUE) 
carter3    <- left_join(carter2, 
                        carter_geo %>% select(obs, bg_lat, bg_long), 
                        by = c("obs")) 

# build date
carter3    <- carter3 %>%
  mutate(
    year    = paste0("19", year) %>% as.numeric(),
    date    = paste0(day, "/", month, "/", year) %>%
      as.Date(format = "%d/%m/%Y")
  )

# manually add geocodes for two observations 
carter3[which(carter3$city == "Pomona"), c("bg_lat", "bg_long")] <-
  c(34.0608, 117.7558)
carter3[which(carter3$city == "Attica"), c("bg_lat", "bg_long")] <-
  c(42.8642267,-78.2803)


write_csv(carter3, path = here("data/Carter_data/carter_violent_protests_geocoded.csv"), na = "")




## ---- merge_census_electoral_data, include = FALSE ----

# load data from scratch, once earlier scrubbing completed
r4 <- read_csv(file = here("data/Carter_data/carter_violent_protests_geocoded.csv"))

# drop obs missing lat lon 
r4 <- r4 %>% filter(!(is.na(bg_lat)))

# load county demographic data
load(file = here("data/county3_demo.Rdata"), verbose = TRUE)

# load geocoded county voting data
load(file = here("data/electoral_4472_geocoded_panel.Rdata"), verbose = TRUE)

# merge county demographic and voting data
# because join includes year, subsets for 1964-1972
vc <- left_join(county3, e8, by = c("fips", "fips_st", "fips_cty", "year"))

#vc %>% head() %>% View()

# reorder columns
vc <- vc %>% select(county, year, st, fips,  everything())


## ---- set_global_params, include = FALSE ----

dist_thresh <- 100
time_thresh <- 365 * 2
part_thresh <- 10

# identify the election days of the 1964 - 1972 elections
elec64 <- as.Date("1964-11-03") ## 1964 Pres/Cong election
elec68 <- as.Date("1968-11-05") ## 1968 Pres/Cong election
elec72 <- as.Date("1972-11-07") ## 1972 Pres/Cong election

# Congressional elections not used
# elec66 <- as.Date("1966-11-01") ## 1966 Congressional election
# elec70 <- as.Date("1970-11-03") ## 1970 Congressional election

n <- nrow(vc)/3

# create dummy indices for county-year rows
# first row = 64, second = 68, third = 72
d64 <- rep(c(TRUE,  FALSE, FALSE), n)
d68 <- rep(c(FALSE, TRUE,  FALSE), n)
d72 <- rep(c(FALSE, FALSE, TRUE),  n)

## ---- merge_carter_protest_data_distance, include = FALSE ----

# create county x protest distance matrix
dist_matrix <- rdist.earth(
    vc[vc$year == 1968, c("bg_long", "bg_lat")] %>% as.data.frame(), 
    r4[, c("bg_long", "bg_lat")])

dim(dist_matrix)

# create simple distance threshold function
dist_func   <- function(x) {x <= dist_thresh}

# with distance matrix, 
dist_treat  <- map_df(.x = dist_matrix %>% as.data.frame(), 
                      .f = function(x) dist_func(x) )

# save county-carter protest distance matrix
carter_dist_matrix <- dist_matrix
save(carter_dist_matrix, file = here("data/carter_county_dist_matrix.Rdata"))


## ---- merge_carter_protest_data_time, include = FALSE ----

time_func <- function(x) {
    # calc whether event is between time_thresh days on lower bound 
    # and zero days (election day) on upper bound
    t64 <- -time_thresh <= (x - elec64) & (x - elec64) <= 0 
    t68 <- -time_thresh <= (x - elec68) & (x - elec68) <= 0
    t72 <- -time_thresh <= (x - elec72) & (x - elec72) <= 0 
    return(data.frame(t64, t68, t72)
    )
}

# across all carter event dats, calc time treatment
time_treat <- map_df(.x = r4[, "date", drop = FALSE], 
                     .f = function(x) time_func(x) )

#dim(time_treat)

## ---- merge_carter_protest_data_arrests, include = FALSE ----

# calc protest effect with matrix multiplication
car_64_df <- t(t(dist_treat) * (r4$newarr >= part_thresh) * time_treat[, "t64"]) 
car_68_df <- t(t(dist_treat) * (r4$newarr >= part_thresh) * time_treat[, "t68"]) 
car_72_df <- t(t(dist_treat) * (r4$newarr >= part_thresh) * time_treat[, "t72"]) 

# protest sum
car_64_sum <- rowSums(car_64_df, na.rm = TRUE)
car_68_sum <- rowSums(car_68_df, na.rm = TRUE)
car_72_sum <- rowSums(car_72_df, na.rm = TRUE)

# carter protest binary
car_64_bin <- (car_64_sum > 0) %>% as.numeric()
car_68_bin <- (car_68_sum > 0) %>% as.numeric()
car_72_bin <- (car_72_sum > 0) %>% as.numeric()

car_bin <- rep(NA, n * 3)
car_bin[d64] <- car_64_bin
car_bin[d68] <- car_68_bin
car_bin[d72] <- car_72_bin

vc$car_bin <- car_bin



## ---- calc_carter_mlk_protest_effect, include = FALSE ----

# did protest occur in April 1968?
time_68_mlk    <- (r4$month == 4) & (r4$year == 1968)

# protest effect 
car_68_mlk_df  <- t(t(dist_treat) * (r4$newarr >= part_thresh) * time_68_mlk) 

# protest sum
car_68_mlk_sum <- rowSums(car_68_mlk_df, na.rm = TRUE)

# carter protest binary
car_68_mlk_bin   <- (car_68_mlk_sum > 0) %>% as.numeric()

car_mlk_bin      <- rep(NA, n * 3)
car_mlk_bin[d68] <- car_68_mlk_bin

vc$car_mlk_bin   <- car_mlk_bin



## ---- rainfall_prep, include = FALSE ----

load(here("data/rainfall_geocoded.Rdata"), verbose = TRUE )


## ---- rainfall_instrument_data_updated, include = FALSE ----

rain_dist <- rdist.earth(
    vc[, c("bg_long", "bg_lat")] %>% as.data.frame(), 
    w4[, c( "lon", "lat")]
    )

dim(rain_dist)
class(rain_dist)

# what are min distances to at least one weather station: 36 miles
which_min_dis <- apply(rain_dist, 1, which.min) 


# identify weather stations within radius of county centroid
which_below_dis <- apply(rain_dist, 1, FUN = function(x) { which(x < 50) } ) 

# at 50, every county with a station has at least two for averaging
ll <- map_dbl(which_below_dis, length)
table(ll)


# calc average for Apr 1-3
prethreeday <-
  map_dbl(
    .x = 1:nrow(rain_dist),
    .f = function(x) {
      (w4$prethreeday[which_below_dis[[x]]]) %>% mean(na.rm = TRUE)
    }
  )

# calc average for Apr 4-11
rain_week <-
  map_dbl(
    .x = 1:nrow(rain_dist),
    .f = function(x) {
      (w4$rain_week[which_below_dis[[x]]]) %>% mean(na.rm = TRUE)
    }
  )

# calc average for Apr 12-30
rain_month_placebo <-
  map_dbl(
    .x = 1:nrow(rain_dist),
    .f = function(x) {
      (w4$rain_month[which_below_dis[[x]]]) %>% mean(na.rm = TRUE)
    }
  )

# check results
which_below_dis[[1]]
summary(prethreeday)
summary(rain_week)
summary(rain_month_placebo)

length(which_min_dis)
#View(which_min_dis)
class(which_min_dis)

which_min_dis_num <- which_min_dis %>% as.numeric()


# diagnostic: how far are weather stations from county center
# median shortest distance is 11 miles, maximum shortest distance is 36 miles
min_dist <- apply(rain_dist, 1, min)
length(min_dist)
summary(min_dist)
hist(min_dist)


# combine county data with mean rainfall within radius
vc2 <- bind_cols(vc, data.frame(prethreeday, rain_week, rain_month_placebo))



## ---- save_combined_data, include = FALSE ----

save(vc2, file = here("data/voting_census_rain.Rdata"))


## ---- create_codebook, eval = FALSE, include = FALSE ----

dataMaid::makeCodebook(
  vc2, 
  file = here("codebooks/codebook_carter_protest_data_combined.Rmd"), 
  #reportTitle = "Carter Protest Data Combined"
  checks = list(character = NULL, factor = NULL), 
  replace = TRUE
  )

